{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pickle\n",
    "import matplotlib as plt\n",
    "from itertools import combinations\n",
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "All data and preprocessing results are stored under `data/`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path='data/' \n",
    "exp_id='v0'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a code mapping dictionary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "codes={}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Drug's target"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load drug's target (from CTDbase)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "gene_drug=pd.read_csv(data_path+'CTD/drug-gene-CTD_C000657245_ixns_20200703223915.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Chemical Name</th>\n",
       "      <th>Chemical ID</th>\n",
       "      <th>CAS RN</th>\n",
       "      <th>Gene Symbol</th>\n",
       "      <th>Gene ID</th>\n",
       "      <th>Interaction</th>\n",
       "      <th>Interaction Actions</th>\n",
       "      <th>Reference Count</th>\n",
       "      <th>Organism Count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Sirolimus</td>\n",
       "      <td>D020123</td>\n",
       "      <td>53123-88-9</td>\n",
       "      <td>A2M</td>\n",
       "      <td>2</td>\n",
       "      <td>Sirolimus results in increased expression of A...</td>\n",
       "      <td>increases^expression</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Ivermectin</td>\n",
       "      <td>D007559</td>\n",
       "      <td>70288-86-7</td>\n",
       "      <td>A2ML1.L</td>\n",
       "      <td>100189580</td>\n",
       "      <td>Ivermectin results in increased expression of ...</td>\n",
       "      <td>increases^expression</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>baricitinib</td>\n",
       "      <td>C000596027</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAK1</td>\n",
       "      <td>22848</td>\n",
       "      <td>baricitinib results in decreased activity of A...</td>\n",
       "      <td>decreases^activity</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Choline</td>\n",
       "      <td>D002794</td>\n",
       "      <td>62-49-7</td>\n",
       "      <td>AARD</td>\n",
       "      <td>441376</td>\n",
       "      <td>[Methionine deficiency co-treated with Choline...</td>\n",
       "      <td>affects^cotreatment|decreases^methylation</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Ivermectin</td>\n",
       "      <td>D007559</td>\n",
       "      <td>70288-86-7</td>\n",
       "      <td>AASS.L</td>\n",
       "      <td>444409</td>\n",
       "      <td>Ivermectin results in decreased expression of ...</td>\n",
       "      <td>decreases^expression</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Chemical Name Chemical ID      CAS RN Gene Symbol    Gene ID  \\\n",
       "0     Sirolimus     D020123  53123-88-9         A2M          2   \n",
       "1    Ivermectin     D007559  70288-86-7     A2ML1.L  100189580   \n",
       "2   baricitinib  C000596027         NaN        AAK1      22848   \n",
       "3       Choline     D002794     62-49-7        AARD     441376   \n",
       "4    Ivermectin     D007559  70288-86-7      AASS.L     444409   \n",
       "\n",
       "                                         Interaction  \\\n",
       "0  Sirolimus results in increased expression of A...   \n",
       "1  Ivermectin results in increased expression of ...   \n",
       "2  baricitinib results in decreased activity of A...   \n",
       "3  [Methionine deficiency co-treated with Choline...   \n",
       "4  Ivermectin results in decreased expression of ...   \n",
       "\n",
       "                         Interaction Actions  Reference Count  Organism Count  \n",
       "0                       increases^expression                1               1  \n",
       "1                       increases^expression                1               1  \n",
       "2                         decreases^activity                1               1  \n",
       "3  affects^cotreatment|decreases^methylation                1               1  \n",
       "4                       decreases^expression                1               1  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gene_drug.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Save the drug name and MeSH ID mapping for later use"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Two-way lookup between upper-cased drug names and their CTD chemical IDs.\n",
    "# The columns are zipped directly: positional `row[0]` access on a label-indexed row is\n",
    "# deprecated in recent pandas, and rows missing either field are dropped so `.upper()`\n",
    "# is never called on a missing value.\n",
    "drug_id_pairs=gene_drug[['Chemical Name','Chemical ID']].dropna().drop_duplicates()\n",
    "codes['drugname2mesh']={name.upper():chem_id for name, chem_id in zip(drug_id_pairs['Chemical Name'], drug_id_pairs['Chemical ID'])}\n",
    "# Keys are upper-cased IDs; values keep the drug name's original capitalisation.\n",
    "codes['mesh2drugname']={chem_id.upper():name for name, chem_id in zip(drug_id_pairs['Chemical Name'], drug_id_pairs['Chemical ID'])}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#gene_drug=gene_drug.loc[gene_drug['Gene Symbol'].isin(codes['gene_symbol2id'].keys())]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reduce the interaction table to unique (gene, drug) edges and prefix each node with its type.\n",
    "# Rows without a drug name cannot form an edge and are dropped up front: previously such a\n",
    "# value passed the upper-casing step untouched and then broke the 'drug_' concatenation.\n",
    "gene_drug=gene_drug[['Gene ID', 'Chemical Name']].dropna(subset=['Chemical Name']).drop_duplicates()\n",
    "\n",
    "gene_drug['Gene ID']='gene_'+gene_drug['Gene ID'].astype(str)\n",
    "gene_drug['Chemical Name']='drug_'+gene_drug['Chemical Name'].astype(str).str.upper()\n",
    "\n",
    "# Upper-casing can merge names that differed only by case, so deduplicate again.\n",
    "gene_drug=gene_drug.drop_duplicates()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pathways"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load pathways from CTDbase"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "pathways=pd.read_csv(data_path+'CTD/pathways-CTD_C000657245_pathways_20200703225429.tsv',sep='\\t')\n",
    "\n",
    "# 'Association inferred via' holds '|'-separated entries that this notebook treats as genes.\n",
    "# Missing values are skipped first (the former `'|' in x` test raised TypeError on NaN);\n",
    "# entries without a '|' name a single gene and therefore yield no pair.\n",
    "pathway_gene_lists=[entry.split('|') for entry in pathways['Association inferred via'].dropna() if '|' in entry]\n",
    "\n",
    "# Every unordered pair of genes listed together becomes one gene-gene row.\n",
    "path_sim=pd.concat([pd.DataFrame(list(combinations(genes,2)),columns=['gene1','gene2']) for genes in pathway_gene_lists]).drop_duplicates()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load pathways from KEGG. The file was preprocessed into pairwise gene–gene links, each labelled `PP` or `PN`; only the `PP` pairs are kept."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_sim_kegg=pd.read_csv(data_path+'biology-database/KegglinkevaluationPPPN_1', header=None,sep='\\t')\n",
    "path_sim_kegg.columns=['gene1', 'gene2', 'positive']\n",
    "# Recode the pair labels: 'PP' -> 1 and 'PN' -> 0.\n",
    "path_sim_kegg=path_sim_kegg.replace('PP', 1).replace('PN', 0)\n",
    "\n",
    "# Keep only the pairs labelled 1. `.copy()` makes this an independent frame, so the column\n",
    "# assignments in the following cells do not act on a slice of the full table.\n",
    "path_sim_kegg=path_sim_kegg.loc[path_sim_kegg['positive']==1, ['gene1','gene2']].copy()\n",
    "\n",
    "gene_name =pd.read_excel(data_path+'biology-database/All_Human_Protein_Coding_Genes_3_27_2020.xlsx')\n",
    "# Gene ID -> gene symbol, used to translate the KEGG pairs into symbols.\n",
    "gene_dict=dict(zip(gene_name['Gene Id'], gene_name['Gene Symbol']))\n",
    "\n",
    "# Later cells read codes['gene_symbol2id'], but no cell in this notebook creates it, so a\n",
    "# fresh Restart & Run All stops with a KeyError. Fall back to the inverse of `gene_dict`.\n",
    "# NOTE(review): assumes the original mapping came from this same gene table - confirm.\n",
    "if 'gene_symbol2id' not in codes:\n",
    "    codes['gene_symbol2id']={symbol:gene_id for gene_id, symbol in gene_dict.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gene1</th>\n",
       "      <th>gene2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10725</td>\n",
       "      <td>7124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10725</td>\n",
       "      <td>1437</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4772</td>\n",
       "      <td>7124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4772</td>\n",
       "      <td>1437</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4773</td>\n",
       "      <td>7124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29503</th>\n",
       "      <td>8733</td>\n",
       "      <td>84720</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29504</th>\n",
       "      <td>8733</td>\n",
       "      <td>80055</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29505</th>\n",
       "      <td>23556</td>\n",
       "      <td>55650</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29506</th>\n",
       "      <td>2822</td>\n",
       "      <td>284098</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29507</th>\n",
       "      <td>55650</td>\n",
       "      <td>9488</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>14772 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       gene1   gene2\n",
       "0      10725    7124\n",
       "1      10725    1437\n",
       "2       4772    7124\n",
       "3       4772    1437\n",
       "4       4773    7124\n",
       "...      ...     ...\n",
       "29503   8733   84720\n",
       "29504   8733   80055\n",
       "29505  23556   55650\n",
       "29506   2822  284098\n",
       "29507  55650    9488\n",
       "\n",
       "[14772 rows x 2 columns]"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "path_sim_kegg.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Translate both endpoints of each KEGG pair from gene ID to gene symbol;\n",
    "# an ID that is absent from `gene_dict` yields None.\n",
    "for endpoint in ('gene1', 'gene2'):\n",
    "    path_sim_kegg[endpoint]=path_sim_kegg[endpoint].apply(lambda gene_id: gene_dict.get(gene_id))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_sim_kegg.dropna(inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load PharmGKB [pathways](https://www.pharmgkb.org/page/COVID)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hand-entered gene lists for three drug pathways. The two de-duplicated lists are sorted:\n",
    "# iterating a bare set of strings can change order between kernel sessions, which would\n",
    "# flip the (gene1, gene2) orientation of the pairs built from these lists.\n",
    "pathway_ace_inhibitor=sorted({'ACE', 'AGT', 'AGTR1', 'AGTR2', 'ATP6AP2', 'BDKRB1', 'BDKRB2', 'CYP11B2',\n",
    "                              'KNG1', 'MAPK1', 'MAPK3', 'MAS1', 'NOS3', 'NR3C2', 'REN', 'TGFB1'})\n",
    "pathway_fluv=['CYP1A2','CYP2C19','CYP3A']\n",
    "pathway_losartan=sorted({'AGTR1', 'CYP2C9', 'CYP3A4', 'UGT1A1', 'UGT2B7'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the KEGG pairs and the pairwise combinations of each hand-entered pathway to the CTD pairs.\n",
    "curated_pairs=[pd.DataFrame(list(combinations(genes,2)),columns=['gene1','gene2'])\n",
    "               for genes in (pathway_ace_inhibitor,pathway_fluv,pathway_losartan)]\n",
    "path_sim=pd.concat([path_sim, path_sim_kegg, *curated_pairs])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn gene symbols into prefixed node names ('gene_<id>').\n",
    "# Step 1: symbol -> gene ID; a symbol missing from the lookup becomes None.\n",
    "for endpoint in ('gene1', 'gene2'):\n",
    "    path_sim[endpoint]=path_sim[endpoint].apply(lambda symbol: codes['gene_symbol2id'].get(symbol))\n",
    "# Step 2: discard pairs where either endpoint could not be mapped.\n",
    "path_sim=path_sim.dropna()\n",
    "# Step 3: format the IDs as integers with the 'gene_' prefix and remove repeated pairs.\n",
    "for endpoint in ('gene1', 'gene2'):\n",
    "    path_sim[endpoint]=path_sim[endpoint].apply(lambda gene_id: 'gene_'+str(int(gene_id)))\n",
    "path_sim=path_sim.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "13423"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(path_sim)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Phenotypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load phenotypes from CTDbase"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "phenotypes=pd.read_csv(data_path+'CTD/phenotype-drug-gene-CTD_C000657245_diseases_20200703224649.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Phenotype Term Name</th>\n",
       "      <th>Phenotype Term ID</th>\n",
       "      <th>Disease Name</th>\n",
       "      <th>Disease ID</th>\n",
       "      <th>Chemical Inference Network</th>\n",
       "      <th>Gene Inference Network</th>\n",
       "      <th>References</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>cell proliferation</td>\n",
       "      <td>GO:0008283</td>\n",
       "      <td>COVID-19</td>\n",
       "      <td>MESH:C000657245</td>\n",
       "      <td>Azithromycin|betulinic acid|cepharanthine|Chlo...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>162</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>apoptotic process</td>\n",
       "      <td>GO:0006915</td>\n",
       "      <td>COVID-19</td>\n",
       "      <td>MESH:C000657245</td>\n",
       "      <td>Acetazolamide|Azithromycin|betulinic acid|ceph...</td>\n",
       "      <td>BTK|IL1B|IL2RA|TNF</td>\n",
       "      <td>290</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>viral life cycle</td>\n",
       "      <td>GO:0019058</td>\n",
       "      <td>COVID-19</td>\n",
       "      <td>MESH:C000657245</td>\n",
       "      <td>(2-tert-butoxy-1-(2-cyclohexyl-1-(1-formyl-2-(...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>48</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>cell death</td>\n",
       "      <td>GO:0008219</td>\n",
       "      <td>COVID-19</td>\n",
       "      <td>MESH:C000657245</td>\n",
       "      <td>betulinic acid|Chloroquine|Dactinomycin|Iverme...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>positive regulation of cell death</td>\n",
       "      <td>GO:0010942</td>\n",
       "      <td>COVID-19</td>\n",
       "      <td>MESH:C000657245</td>\n",
       "      <td>Active Hexose Correlated Compound|betulinic ac...</td>\n",
       "      <td>IL1B</td>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 Phenotype Term Name Phenotype Term ID Disease Name  \\\n",
       "0                 cell proliferation        GO:0008283     COVID-19   \n",
       "1                  apoptotic process        GO:0006915     COVID-19   \n",
       "2                   viral life cycle        GO:0019058     COVID-19   \n",
       "3                         cell death        GO:0008219     COVID-19   \n",
       "4  positive regulation of cell death        GO:0010942     COVID-19   \n",
       "\n",
       "        Disease ID                         Chemical Inference Network  \\\n",
       "0  MESH:C000657245  Azithromycin|betulinic acid|cepharanthine|Chlo...   \n",
       "1  MESH:C000657245  Acetazolamide|Azithromycin|betulinic acid|ceph...   \n",
       "2  MESH:C000657245  (2-tert-butoxy-1-(2-cyclohexyl-1-(1-formyl-2-(...   \n",
       "3  MESH:C000657245  betulinic acid|Chloroquine|Dactinomycin|Iverme...   \n",
       "4  MESH:C000657245  Active Hexose Correlated Compound|betulinic ac...   \n",
       "\n",
       "  Gene Inference Network  References  \n",
       "0                    NaN         162  \n",
       "1     BTK|IL1B|IL2RA|TNF         290  \n",
       "2                    NaN          48  \n",
       "3                    NaN         101  \n",
       "4                   IL1B          40  "
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "phenotypes.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Phenotype term ID -> phenotype term name. The two columns are zipped directly because\n",
    "# positional `row[0]` access on a label-indexed row is deprecated in recent pandas releases.\n",
    "phenotype_terms=phenotypes[['Phenotype Term ID','Phenotype Term Name']].drop_duplicates()\n",
    "codes['phenotype_id_to_name']=dict(zip(phenotype_terms['Phenotype Term ID'], phenotype_terms['Phenotype Term Name']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Drugs and phenotypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build drug-phenotype edges: split each '|'-separated drug list into one column per drug,\n",
    "# re-attach the phenotype ID by row index, then melt into one (phenotype, drug) row per drug.\n",
    "drug_lists=phenotypes['Chemical Inference Network'].dropna().apply(lambda x: x.split('|')).apply(pd.Series)\n",
    "drug_phenotype=(drug_lists\n",
    "                .merge(phenotypes['Phenotype Term ID'],left_index=True, right_index=True)\n",
    "                .melt(id_vars=['Phenotype Term ID'],value_name='drug')\n",
    "                .drop('variable', axis=1)\n",
    "                .dropna())\n",
    "\n",
    "# Upper-case the drug names and prefix both node types.\n",
    "drug_phenotype['drug']=drug_phenotype['drug'].apply(lambda x: 'drug_'+x.upper())\n",
    "drug_phenotype['Phenotype Term ID']=drug_phenotype['Phenotype Term ID'].apply(lambda x: 'phenotype_'+x)\n",
    "\n",
    "# `.copy()` detaches the reordered frame from its parent; without it, adding a column to\n",
    "# this frame later triggers pandas' SettingWithCopyWarning.\n",
    "drug_phenotype=drug_phenotype[['drug', 'Phenotype Term ID']].copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Genes and phenotypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build gene-phenotype edges: split each '|'-separated gene list into one column per gene,\n",
    "# re-attach the phenotype ID by row index, then melt into one (phenotype, gene) row per gene.\n",
    "gene_lists=phenotypes['Gene Inference Network'].dropna().apply(lambda x: x.split('|')).apply(pd.Series)\n",
    "gene_phenotype=(gene_lists\n",
    "                .merge(phenotypes['Phenotype Term ID'],left_index=True, right_index=True)\n",
    "                .melt(id_vars=['Phenotype Term ID'],value_name='gene')\n",
    "                .drop('variable', axis=1)\n",
    "                .dropna())\n",
    "gene_phenotype['Phenotype Term ID']=gene_phenotype['Phenotype Term ID'].apply(lambda x: 'phenotype_'+x)\n",
    "\n",
    "# Symbol -> gene ID; symbols missing from the lookup become None and their rows are dropped.\n",
    "gene_phenotype['gene']=gene_phenotype['gene'].apply(lambda x: codes['gene_symbol2id'].get(x))\n",
    "gene_phenotype=gene_phenotype.dropna()\n",
    "gene_phenotype['gene']=gene_phenotype['gene'].apply(lambda x: 'gene_'+str(int(x)))\n",
    "\n",
    "# `.copy()` detaches the reordered frame from its parent, so adding a column to it later\n",
    "# does not trigger pandas' SettingWithCopyWarning.\n",
    "gene_phenotype=gene_phenotype[['gene', 'Phenotype Term ID']].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "31"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set(gene_drug['Chemical Name'].values).intersection(set(drug_phenotype['drug'].values)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "36"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set(drug_phenotype['drug'].values))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Baits and Prey"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load SARS-CoV-2 bait and host gene interactions from [Gordon et al. Nature 2020](https://www.nature.com/articles/s41586-020-2286-9#Sec36)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "baits_prey=pd.read_csv(data_path+'biology-database/baits-prey-mist.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the bait and prey-gene columns. `.copy()` gives an independent frame, so the\n",
    "# column assignments and in-place row drops applied to it later do not act on a slice of\n",
    "# the loaded table (which is what raises SettingWithCopyWarning).\n",
    "baits_prey=baits_prey[['Bait', 'PreyGene']].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(27, 332)"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "baits_prey['Bait'].nunique(), baits_prey['PreyGene'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prefix the baits, translate prey symbols to gene IDs, and drop prey without an ID\n",
    "# before formatting the remaining IDs as 'gene_<id>'.\n",
    "baits_prey['Bait']=baits_prey['Bait'].apply(lambda bait: 'bait_'+bait)\n",
    "baits_prey['PreyGene']=baits_prey['PreyGene'].apply(lambda symbol: codes['gene_symbol2id'].get(symbol))\n",
    "baits_prey=baits_prey.dropna()\n",
    "baits_prey['PreyGene']=baits_prey['PreyGene'].apply(lambda gene_id: 'gene_'+str(int(gene_id)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4427, 1762, 330, 18)"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#size of drug target, pathway, host gene, phenotype-related genes\n",
    "gene_drug['Gene ID'].nunique(),len(set(path_sim[['gene1','gene2']].values.ravel())),len(set(baits_prey['PreyGene'].unique())), gene_phenotype['gene'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "94"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#number of intersection between host genes and drug target\n",
    "len(set(baits_prey['PreyGene'].unique()).intersection(gene_drug['Gene ID'].unique()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "733"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#intersection between target and pathways\n",
    "len(set(gene_drug['Gene ID'].unique()).intersection(set(path_sim[['gene1','gene2']].values.ravel())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#intersection between pathways and host gene\n",
    "len(set(baits_prey['PreyGene'].unique()).intersection(set(path_sim[['gene1','gene2']].values.ravel())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#intersection between pathways, target genes, host gene\n",
    "len(set(baits_prey['PreyGene'].unique()).intersection(set(path_sim[['gene1','gene2']].values.ravel())).intersection(set(gene_drug['Gene ID'].unique()) ))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Item2idx mapping"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Map all the entities (drugs, genes, phenotypes, baits) to ID"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Give every edge table the same two endpoint column names so they can be stacked later.\n",
    "# Endpoint order per table: gene_drug (gene, drug); path_sim (gene1, gene2);\n",
    "# baits_prey (bait, prey gene); gene_phenotype (gene, phenotype); drug_phenotype (drug, phenotype).\n",
    "for edge_table in (gene_drug, path_sim, baits_prey, gene_phenotype, drug_phenotype):\n",
    "    edge_table.columns=['node1', 'node2']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \"\"\"\n"
     ]
    }
   ],
   "source": [
    "# Label every edge table with its relation type, then stack them into a single edge list.\n",
    "typed_tables=[(gene_drug, 'gene-drug'),\n",
    "              (path_sim, 'gene-gene'),\n",
    "              (baits_prey, 'bait-gene'),\n",
    "              (gene_phenotype, 'gene-phenotype'),\n",
    "              (drug_phenotype, 'drug-phenotype')]\n",
    "for edge_table, edge_type in typed_tables:\n",
    "    edge_table['type']=edge_type\n",
    "\n",
    "edge_index=pd.concat([edge_table for edge_table, _ in typed_tables])\n",
    "\n",
    "# Force both endpoint columns to plain strings.\n",
    "for endpoint in ('node1', 'node2'):\n",
    "    edge_index[endpoint]=edge_index[endpoint].astype(str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Label Encoders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LabelEncoder()"
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "le=LabelEncoder()\n",
    "le.fit(np.concatenate((edge_index['node1'], edge_index['node2'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "edge_index['node1']=le.transform(edge_index['node1'])\n",
    "edge_index['node2']=le.transform(edge_index['node2'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10624"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(le.classes_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import pre-trained embedding"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Obtain pre-trained entity embedding from [DRKG](https://github.com/gnn4dr/DRKG)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!wget https://dgl-data.s3-us-west-2.amazonaws.com/dataset/DRKG/drkg.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Get pretrained embedding\n",
    "entity_emb=np.load(data_path+'DRKG/embed/DRKG_TransE_l2_entity.npy')\n",
    "emb_size=entity_emb.shape[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
    "entity_idmap_file = data_path+'DRKG/embed/entities.tsv'\n",
    "relation_idmap_file = data_path+'DRKG/embed/relations.tsv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebuild DRKG-style entity names from the prefixed node labels ('<type>_<name>').\n",
    "# Each label is split on its FIRST underscore only, so a name that itself contains '_'\n",
    "# is kept whole instead of being cut off at that underscore.\n",
    "baits_drkg=['Disease::'+entity.split('_',1)[1] for entity in le.classes_ if entity.split('_',1)[0]=='bait']\n",
    "gene_drkg = ['Gene::'+entity.split('_',1)[1] for entity in le.classes_ if entity.split('_',1)[0]=='gene']\n",
    "phenotype_drkg=['Biological Process::'+entity.split('_',1)[1] for entity in le.classes_ if entity.split('_',1)[0]=='phenotype']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Map each drug name to an external compound ID (DrugBank or ChEMBL) so it can be matched to a DRKG `Compound::` entity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the two name->ID tables under a common 'id' column and keep, per drug name, the\n",
    "# first non-null value of each column (the DrugBank table comes first in the stack).\n",
    "drugname2external=pd.concat([pd.read_csv(data_path+'CTD/alldrugbank.csv', index_col=0).rename(columns={'drugbank_id':'id'}),\n",
    "           pd.read_csv(data_path+'CTD/nondrugbank.csv', index_col=0).rename(columns={'chembl':'id'})]).groupby('drugname', as_index=False).first()\n",
    "# Upper-cased drug name -> external ID. Names whose ID is missing are left out, so they\n",
    "# take the '' default below instead of failing the string concatenation with a NaN.\n",
    "drugname2id={name.upper():external_id for name, external_id in zip(drugname2external['drugname'], drugname2external['id']) if pd.notna(external_id)}\n",
    "# Split on the first underscore only so drug names that contain '_' are looked up whole.\n",
    "drug_drkg = ['Compound::'+drugname2id.get(entity.split('_',1)[1],'') for entity in le.classes_ if entity.split('_',1)[0]=='drug']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get drugname/disease name to entity ID mappings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Read the DRKG name/index tables. Both files are tab-separated and are read without a\n",
    "# header row: the column names are supplied here, so the first line counts as data.\n",
    "entity_map = {}      # entity name -> integer index\n",
    "entity_id_map = {}   # integer index -> entity name\n",
    "relation_map = {}    # relation name -> integer index\n",
    "\n",
    "with open(entity_idmap_file, newline='', encoding='utf-8') as csvfile:\n",
    "    for row_val in csv.DictReader(csvfile, delimiter='\\t', fieldnames=['name','id']):\n",
    "        entity_index = int(row_val['id'])\n",
    "        entity_map[row_val['name']] = entity_index\n",
    "        entity_id_map[entity_index] = row_val['name']\n",
    "\n",
    "with open(relation_idmap_file, newline='', encoding='utf-8') as csvfile:\n",
    "    for row_val in csv.DictReader(csvfile, delimiter='\\t', fieldnames=['name','id']):\n",
    "        relation_map[row_val['name']] = int(row_val['id'])\n",
    "\n",
    "# Look up the index of every node; a name that is not in `entity_map` gives None.\n",
    "bait_ids = [entity_map.get(bait_name) for bait_name in baits_drkg]\n",
    "gene_ids = [entity_map.get(gene_name_drkg) for gene_name_drkg in gene_drkg]\n",
    "drug_ids = [entity_map.get(drug_name) for drug_name in drug_drkg]\n",
    "phenotype_ids = [entity_map.get(phenotype_name) for phenotype_name in phenotype_drkg]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "def lookup_embeddings(entity_ids):\n",
    "    \"\"\"Return one embedding row per ID, taken from `entity_emb`.\n",
    "\n",
    "    An ID of `None` (no matching entity was found) is replaced by a zero vector of\n",
    "    length `emb_size`. The result is always 2-D with `emb_size` columns, even when\n",
    "    `entity_ids` is empty, so the per-type matrices can be concatenated row-wise.\n",
    "    \"\"\"\n",
    "    rows = [entity_emb[idx] if idx is not None else np.zeros(emb_size) for idx in entity_ids]\n",
    "    # reshape is a no-op for a non-empty list; it turns an empty result into shape (0, emb_size)\n",
    "    return np.array(rows).reshape(-1, emb_size)\n",
    "\n",
    "\n",
    "bait_emb = lookup_embeddings(bait_ids)\n",
    "drug_emb = lookup_embeddings(drug_ids)\n",
    "gene_emb = lookup_embeddings(gene_ids)\n",
    "phenotype_emb = lookup_embeddings(phenotype_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3635, 1138)"
      ]
     },
     "execution_count": 129,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drug coverage: (number of drugs, number of drugs that received an entity ID)\n",
    "len(drug_ids), sum(drug_id is not None for drug_id in drug_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5677, 5666)"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Gene coverage: (number of genes, number of genes that received an entity ID)\n",
    "len(gene_ids), sum(gene_id is not None for gene_id in gene_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1285, 970)"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Phenotype coverage: (number of phenotypes, number of phenotypes that received an entity ID)\n",
    "len(phenotype_ids), sum(phenotype_id is not None for phenotype_id in phenotype_ids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The pre-trained embedding is now used as the node feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the per-type embedding matrices row-wise, in the order bait, drug, gene, phenotype.\n",
    "# NOTE(review): the row order presumably has to line up with the label-encoder indices - confirm.\n",
    "node_features = np.concatenate(\n",
    "    [bait_emb, drug_emb, gene_emb, phenotype_emb],\n",
    "    axis=0,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save as pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the preprocessing results under data_path; every file name carries exp_id.\n",
    "edge_index.to_pickle(data_path+'edge_index_'+exp_id+'.pkl')\n",
    "\n",
    "# Context managers make sure each file is flushed and closed right after it is written.\n",
    "with open(data_path+'LabelEncoder_'+exp_id+'.pkl', 'wb') as pkl_file:\n",
    "    pickle.dump(le, pkl_file)\n",
    "\n",
    "with open(data_path+'node_feature_'+exp_id+'.pkl', 'wb') as pkl_file:\n",
    "    pickle.dump(node_features, pkl_file)\n",
    "\n",
    "with open(data_path+'codes_'+exp_id+'.pkl', 'wb') as pkl_file:\n",
    "    pickle.dump(codes, pkl_file)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
